This notebook is the PokemonGo project for the course ‘Customer Analytics’

Team


~~~~

Preparation

~~~~

Import the libraries needed

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggplot2)
library(tidyr)
library(caTools)

Import and read the data

# Working directory containing the raw CSV exports
# setwd("~/Desktop/MSc Data Analytics & AI/Semester 2 /Customer Analytics/Pokemon Go case")
setwd("C:/Users/Charles-Louis/Desktop/MSc/2 - Customer Analytics/PokemonGo")


# 1 - Customerdata: one row per player with demographic information
customer <- read.csv("customerdata.csv")

# 2 - Summerfintrx & Fallfintrx: financial (micro)transactions per period
summer_trx <- read.csv("summerfintrx.csv")
fall_trx   <- read.csv("fallfintrx.csv")

# 3 - SummersessTRX and FallsessTRX: play sessions per period
summer_sess <- read.csv("summersesstrx.csv")
fall_sess   <- read.csv("fallsesstrx.csv")

Convert Date fields into Date format

# Parse every date column from ISO "YYYY-MM-DD" strings into Date objects
iso_fmt <- "%Y-%m-%d"
customer$Registrationdate <- as.Date(customer$Registrationdate, iso_fmt)
summer_sess$Date <- as.Date(summer_sess$Date, iso_fmt)
fall_sess$Date   <- as.Date(fall_sess$Date, iso_fmt)
summer_trx$Date  <- as.Date(summer_trx$Date, iso_fmt)
fall_trx$Date    <- as.Date(fall_trx$Date, iso_fmt)

Quick analysis of the customer table with a summary

# Overview of demographics, registration dates and the fall-bonus flag
summary(customer)
##        X          CustomerID    CustomerType   Registrationdate    
##  Min.   :   1   Min.   :   1   Min.   :1.000   Min.   :2016-07-01  
##  1st Qu.:1251   1st Qu.:1251   1st Qu.:1.000   1st Qu.:2016-12-08  
##  Median :2500   Median :2500   Median :2.000   Median :2018-01-01  
##  Mean   :2500   Mean   :2500   Mean   :2.497   Mean   :2017-08-13  
##  3rd Qu.:3750   3rd Qu.:3750   3rd Qu.:4.000   3rd Qu.:2018-03-02  
##  Max.   :5000   Max.   :5000   Max.   :4.000   Max.   :2018-04-30  
##       Sex              Age          fallbonus       Income     
##  Min.   :0.0000   Min.   : 6.00   Min.   :0.0   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:18.00   1st Qu.:0.0   1st Qu.:1.000  
##  Median :0.0000   Median :27.00   Median :0.0   Median :2.000  
##  Mean   :0.4038   Mean   :27.51   Mean   :0.2   Mean   :2.011  
##  3rd Qu.:1.0000   3rd Qu.:33.00   3rd Qu.:0.0   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :80.00   Max.   :1.0   Max.   :3.000

~~

Assignment 1 (3 points) - Creation of a basetable

~~

1) Create a basetable containing the active customers (at least 1 play session during the summer period). Indicate whether they received the fall boost package discount or not.

# Basetable restricted to customers active in summer (>= 1 play session);
# this is why summer and fall transactions/sessions were kept separate.
basetable_summer <- summer_sess %>%
  distinct(CustomerID) %>%
  left_join(customer, by = 'CustomerID') %>%
  select(-X) %>%
  arrange(CustomerID)
basetable_summer

2) Calculate the demographics and RFM metrics for the relevant play and financial transactions database.

2.1) RFM metrics

We assign monetary values to each ProductID in transaction dataframes

# Unit price per ProductID (1 to 5). Defined once so both tables share the
# same price list instead of duplicating the magic vector in each mutate.
product_prices <- c(2.99, 4.99, 9.99, 25, 99)
summer_trx <- summer_trx %>% mutate(Value = product_prices[ProductID])
fall_trx   <- fall_trx %>% mutate(Value = product_prices[ProductID])

We define the end date for the period ( here we are studying summer)

# End of the summer observation window; reference date for recency
enddate_summer <- as.Date("2018-08-31","%Y-%m-%d")

We are going to evaluate the frequency and recency based on the sessions, then we will evaluate the monetary value based on the transaction dataset. We also calculate the Recency and Frequency of transactions as additional information.

# Recency (days before end of summer) and frequency from play sessions
summer_sess_RF <- summer_sess %>%
  group_by(CustomerID) %>%
  summarise(
    Recency = as.numeric(enddate_summer - max(Date)),
    Frequency = n()
  ) %>%
  ungroup()

# Recency, frequency and monetary value from financial transactions
summer_trx_RFM <- summer_trx %>%
  group_by(CustomerID) %>%
  summarise(
    Recency_Transac = as.numeric(enddate_summer - max(Date)),
    Frequency_Transac = n(),
    Monetary = sum(Value)
  ) %>%
  ungroup()

Here we get the RFM for each summer user (PAID or NOT)

# Left join: keep every player, including those without any purchase
summer_RFM <- merge(summer_sess_RF, summer_trx_RFM, by = "CustomerID", all.x = TRUE)

# Non-buyers get zero spend and zero purchase frequency instead of NA
summer_RFM$Monetary[is.na(summer_RFM$Monetary)] <- 0
summer_RFM$Frequency_Transac[is.na(summer_RFM$Frequency_Transac)] <- 0

summary(summer_RFM)
##    CustomerID      Recency         Frequency      Recency_Transac 
##  Min.   :   1   Min.   :  0.00   Min.   : 1.000   Min.   :  0.00  
##  1st Qu.:1252   1st Qu.:  8.00   1st Qu.: 2.000   1st Qu.: 26.00  
##  Median :2506   Median : 21.00   Median : 4.000   Median : 54.00  
##  Mean   :2502   Mean   : 29.82   Mean   : 4.243   Mean   : 56.27  
##  3rd Qu.:3751   3rd Qu.: 43.00   3rd Qu.: 6.000   3rd Qu.: 85.00  
##  Max.   :5000   Max.   :121.00   Max.   :18.000   Max.   :121.00  
##                                                   NA's   :3006    
##  Frequency_Transac    Monetary      
##  Min.   :0.000     Min.   :  0.000  
##  1st Qu.:0.000     1st Qu.:  0.000  
##  Median :0.000     Median :  0.000  
##  Mean   :0.473     Mean   :  4.179  
##  3rd Qu.:1.000     3rd Qu.:  4.990  
##  Max.   :5.000     Max.   :200.990  
## 
# Sentinel value: -1 marks players who never made a summer transaction
summer_RFM$Recency_Transac[is.na(summer_RFM$Recency_Transac)] <- -1

summer_RFM

2.2) Demographics metrics

Here we are joining the RFM table with the customer informations

# Enrich the RFM table with customer demographics (dropping the X index)
basetable_summer <- merge(summer_RFM, select(customer, -X),
                          by = 'CustomerID', all.x = TRUE)
basetable_summer

Here we join the previous information with the customer session information and the number of transactions

# Per-customer totals of every session metric (Experience, Pokestops, ...).
# funs() is deprecated since dplyr 0.8 (see the warning in the original
# output); passing the function directly is the supported form.
summer_sess_cust_stat <- summer_sess %>%
  select(-c(X, PlayID, Date)) %>%
  group_by(CustomerID) %>%
  summarise_all(sum)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.
# Number of play sessions per customer
summer_sess_cust_nb <- summer_sess %>%
  group_by(CustomerID) %>%
  summarise(NbSess = n())

# Purchases per customer and product, widened to one column per product
summer_trx_cust_stat <- summer_trx %>%
  group_by(CustomerID, ProductID) %>%
  summarise(NbTrx = n()) %>%
  mutate(ProductID = paste0("NbTrxProd", ProductID)) %>%
  spread(ProductID, NbTrx)

# Total number of purchases per customer
summer_trx_cust_nb <- summer_trx %>%
  group_by(CustomerID) %>%
  summarise(NbTrx = n())

# Enrich the basetable with the session and transaction aggregates
basetable_summer <- basetable_summer %>%
  left_join(summer_sess_cust_stat, by = 'CustomerID') %>%
  left_join(summer_sess_cust_nb, by = 'CustomerID') %>%
  left_join(summer_trx_cust_stat, by = 'CustomerID') %>%
  left_join(summer_trx_cust_nb, by = 'CustomerID')

# Customers without transactions get 0 instead of NA
basetable_summer[is.na(basetable_summer)] <- 0

basetable_summer

Based on these metrics, sketch a general profile (use the correct descriptive metric for each variable) of the customer base according to demographics, spending and usage transactions.

# General profile: mean of every variable, overall and per customer type.
# funs() is deprecated since dplyr 0.8; pass the function directly instead.
basetable_summer %>% summarise_all(mean)
basetable_summer %>% group_by(CustomerType) %>% summarise_all(mean)

Calculate the customer life time value for these customers. You can make assumptions for unknown variables (e.g., discount rate, # periods in the future), but motivate your assumptions clearly in the report.

Quick calculation of the transaction churn rate

# Transaction churn: summer buyers who made no purchase at all in fall
summer_buyers <- unique(summer_trx$CustomerID)
cbind(summer_trx %>% summarise(Summer_Clients = n_distinct(CustomerID)),
      fall_trx %>%
        filter(CustomerID %in% summer_buyers) %>%
        summarise(Fall_NotChurners = n_distinct(CustomerID))) %>%
  mutate(ChurnRate = 1 - Fall_NotChurners / Summer_Clients)

We create the CLV calculation formula

#' Customer lifetime value over periods 0..t.
#'
#' CLV = -acquisition + sum_{i=0}^{t} margin * r^i / (1 + d)^i
#' (period 0 included, as in the original loop).
#'
#' @param margin      per-period margin; scalar or vector (vectorized over customers)
#' @param r           retention probability per period, in [0, 1]
#' @param d           discount rate per period
#' @param acquisition up-front acquisition cost (scalar)
#' @param t           number of future periods (>= 0)
#' @return numeric CLV, same length as margin
calc_clv <- function(margin, r, d, acquisition, t) {
  # margin is constant across periods, so the loop collapses to a
  # geometric sum; this also makes the function vectorized over margin.
  discount_factors <- (r / (1 + d))^(0:t)
  margin * sum(discount_factors) - acquisition
}

Assumptions: - acquisition = 0, since we have no acquisition cost data - margin = Monetary, the monetary value of each user - r (retention probability) = 0.657, taken from the churn computation above
- t (time horizon) = 4, i.e. 4 seasons = 1 year (churn is quarterly) - d (discount rate) = 0.1, a quarterly discount rate

# CLV per customer with margin = summer spend (Monetary).
# The original apply() passed a 3-column slice only to read its third
# element; vapply over the Monetary column is simpler and type-stable.
# Assumptions: r = 0.657, d = 0.1 (quarterly), no acquisition cost, t = 4.
basetable_summer$clv <- vapply(basetable_summer$Monetary,
                               function(m) calc_clv(m, 0.657, 0.1, 0, 4),
                               numeric(1))

Now we have a new variable which is the CLV for each user.

# Inspect the new CLV column.
# NOTE(review): "Duration" is displayed although clv is spend-based —
# possibly "Monetary" was intended here; confirm.
basetable_summer[,c("CustomerID","Duration","clv")]

NB : We can also create a CLV based on time spent on the game. The churn rate needs to be calculated on session tables and the margin is now the time spent

Sessions average churn

# Session-based churn: share of summer *players* with no fall sessions.
# BUG FIX: the original filtered fall sessions on unique(summer_trx$CustomerID)
# (summer buyers); for a session churn rate it must be summer_sess (players).
cbind(summer_sess %>% summarise(Summer_Users = n_distinct(CustomerID)),
      fall_sess %>% filter(CustomerID %in% unique(summer_sess$CustomerID)) %>% 
                    summarise(Fall_NotChurners = n_distinct(CustomerID))
      ) %>% mutate(ChurnRate = 1-Fall_NotChurners/Summer_Users)
# Time-based CLV: margin = total playing time (Duration) per customer.
# vapply over the Duration column replaces the original apply() call that
# passed a 3-column slice only to read its third element.
basetable_summer$clv_time <- vapply(basetable_summer$Duration,
                                    function(m) calc_clv(m, 0.657, 0.1, 0, 4),
                                    numeric(1))

basetable_summer[,c("CustomerID","Duration","clv_time")]

~~~~

FOR THE REPORT : EXPLORATORY ANALYSIS OF THE BASETABLE

~~~~

# CustomerType is still numeric in the two distribution plots below, so the
# colour scale is continuous there (matching the original output).
ggplot(basetable_summer, aes(x = Frequency, colour = CustomerType)) +
  theme_bw() +
  scale_x_continuous(breaks = c(1:30)) +
  geom_bar(alpha = 0.6) +
  ggtitle("Distribution by Frequency")

ggplot(basetable_summer, aes(x = Recency, colour = CustomerType)) +
  theme_bw() +
  geom_bar(alpha = 0.6) +
  ggtitle("Distribution by Recency")

# Convert once: the original repeated this conversion before every plot,
# which is a no-op after the first call.
basetable_summer$CustomerType <- as.factor(basetable_summer$CustomerType)

# Helper: boxplot of one metric split by customer type.
# aes_string keeps the axis label equal to the column name, matching the
# labels produced by the original per-plot aes() calls.
plot_box_by_type <- function(metric) {
  ggplot(basetable_summer,
         aes_string(x = "CustomerType", y = metric, colour = "CustomerType")) +
    theme_bw() +
    geom_boxplot()
}

plot_box_by_type("Monetary")
plot_box_by_type("Age")
plot_box_by_type("Pokestops")
plot_box_by_type("Experience")
plot_box_by_type("Gyms")
plot_box_by_type("Raids")
plot_box_by_type("Social")
plot_box_by_type("Pokemons")
plot_box_by_type("Distance")
plot_box_by_type("clv")

~~~~

Assignment 2 (3,5 points) - Lifecycle grids

~~~~

Here we replace continous value per categorical value for Life Cycle Grid

# Discretise Frequency and Recency into lifecycle-grid segments.
# The original nested ifelse(between(x, k, k), ...) chains are replaced by
# flatter, equivalent expressions (Frequency and Recency are integer-valued:
# Frequency = n() >= 1, Recency = whole days >= 0).
basetable_summer <- basetable_summer %>%
  mutate(segm.freq = ifelse(Frequency > 5, '>5', as.character(Frequency))) %>%
  mutate(segm.rec = case_when(
    Recency <= 5  ~ '0-5 days',
    Recency <= 10 ~ '6-10 days',
    Recency <= 20 ~ '11-20 days',
    Recency <= 30 ~ '21-30 days',
    Recency <= 50 ~ '31-50 days',
    TRUE          ~ '>50 days'
  )) %>%
  mutate(Sex = ifelse(Sex == 1, "Women", "Men"))

# Ordered factor levels so the grids render from low to high activity
basetable_summer$segm.freq <- factor(basetable_summer$segm.freq,
                                     levels = c('>5', '5', '4', '3', '2', '1'))
basetable_summer$segm.rec <- factor(basetable_summer$segm.rec,
                                    levels = c('>50 days', '31-50 days', '21-30 days',
                                               '11-20 days', '6-10 days', '0-5 days'))

basetable_summer$Sex <- factor(basetable_summer$Sex, levels = c("Men", "Women"))

summary(basetable_summer)
##    CustomerID      Recency         Frequency      Recency_Transac 
##  Min.   :   1   Min.   :  0.00   Min.   : 1.000   Min.   : -1.00  
##  1st Qu.:1252   1st Qu.:  8.00   1st Qu.: 2.000   1st Qu.: -1.00  
##  Median :2506   Median : 21.00   Median : 4.000   Median : -1.00  
##  Mean   :2502   Mean   : 29.82   Mean   : 4.243   Mean   : 19.77  
##  3rd Qu.:3751   3rd Qu.: 43.00   3rd Qu.: 6.000   3rd Qu.: 32.00  
##  Max.   :5000   Max.   :121.00   Max.   :18.000   Max.   :121.00  
##  Frequency_Transac    Monetary       CustomerType Registrationdate    
##  Min.   :0.000     Min.   :  0.000   1:1225       Min.   :2016-07-01  
##  1st Qu.:0.000     1st Qu.:  0.000   2:1144       1st Qu.:2016-12-07  
##  Median :0.000     Median :  0.000   3:1069       Median :2017-12-16  
##  Mean   :0.473     Mean   :  4.179   4:1279       Mean   :2017-08-10  
##  3rd Qu.:1.000     3rd Qu.:  4.990                3rd Qu.:2018-03-01  
##  Max.   :5.000     Max.   :200.990                Max.   :2018-04-30  
##     Sex            Age          fallbonus          Income     
##  Men  :2815   Min.   : 6.00   Min.   :0.0000   Min.   :1.000  
##  Women:1902   1st Qu.:18.00   1st Qu.:0.0000   1st Qu.:1.000  
##               Median :27.00   Median :0.0000   Median :2.000  
##               Mean   :27.58   Mean   :0.2006   Mean   :2.014  
##               3rd Qu.:33.00   3rd Qu.:0.0000   3rd Qu.:3.000  
##               Max.   :80.00   Max.   :1.0000   Max.   :3.000  
##    Experience       Pokestops           Gyms            Raids       
##  Min.   :  1767   Min.   :  9.00   Min.   :  2.00   Min.   : 0.000  
##  1st Qu.:  9036   1st Qu.: 34.00   1st Qu.: 14.00   1st Qu.: 0.000  
##  Median : 15738   Median : 57.00   Median : 26.00   Median : 1.000  
##  Mean   : 19110   Mean   : 65.93   Mean   : 31.48   Mean   : 3.098  
##  3rd Qu.: 25097   3rd Qu.: 91.00   3rd Qu.: 43.00   3rd Qu.: 4.000  
##  Max.   :104901   Max.   :263.00   Max.   :150.00   Max.   :53.000  
##      Social          Pokemons         Distance          Duration      
##  Min.   : 0.000   Min.   :  4.00   Min.   :  0.383   Min.   :  10.77  
##  1st Qu.: 1.000   1st Qu.: 36.00   1st Qu.:  5.084   1st Qu.:  86.75  
##  Median : 2.000   Median : 66.00   Median : 10.217   Median : 155.45  
##  Mean   : 4.898   Mean   : 78.87   Mean   : 14.969   Mean   : 278.67  
##  3rd Qu.: 5.000   3rd Qu.:107.00   3rd Qu.: 19.762   3rd Qu.: 285.93  
##  Max.   :60.000   Max.   :465.00   Max.   :117.339   Max.   :2353.39  
##      NbSess         NbTrxProd1       NbTrxProd2       NbTrxProd3     
##  Min.   : 1.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 2.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median : 4.000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   : 4.243   Mean   :0.1749   Mean   :0.1556   Mean   :0.08204  
##  3rd Qu.: 6.000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :18.000   Max.   :3.0000   Max.   :3.0000   Max.   :2.00000  
##    NbTrxProd4      NbTrxProd5          NbTrx            clv         
##  Min.   :0.000   Min.   :0.00000   Min.   :0.000   Min.   :  0.000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.000   1st Qu.:  0.000  
##  Median :0.000   Median :0.00000   Median :0.000   Median :  0.000  
##  Mean   :0.053   Mean   :0.00742   Mean   :0.473   Mean   :  9.587  
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:1.000   3rd Qu.: 11.449  
##  Max.   :2.000   Max.   :2.00000   Max.   :5.000   Max.   :461.138  
##     clv_time      segm.freq       segm.rec  
##  Min.   :  24.7   >5:1243   >50 days  :937  
##  1st Qu.: 199.0   5 : 496   31-50 days:820  
##  Median : 356.6   4 : 654   21-30 days:658  
##  Mean   : 639.4   3 : 830   11-20 days:864  
##  3rd Qu.: 656.0   2 : 831   6-10 days :555  
##  Max.   :5399.5   1 : 663   0-5 days  :883
# Transaction-level table with the buyer's customer type attached
lcg.product <- summer_trx %>%
  left_join(customer, by = "CustomerID") %>%
  select(CustomerID, ProductID, Value, CustomerType)

Distribution of products by customer type (may be useful later)

# Revenue and purchase counts per product, broken down by customer type
lcg.product2 <- lcg.product %>%
  group_by(ProductID) %>%
  summarise(
    Monetary = sum(Value),
    Type1 = sum(ifelse(CustomerType == 1, 1, 0)),
    Type2 = sum(ifelse(CustomerType == 2, 1, 0)),
    Type3 = sum(ifelse(CustomerType == 3, 1, 0)),
    Type4 = sum(ifelse(CustomerType == 4, 1, 0))
  )
lcg.product2 

Life cycle grid by quantity , frequency , recency

# Grid cells: player count and revenue per (recency, frequency) segment
lcg <- basetable_summer %>%
  group_by(segm.rec, segm.freq) %>%
  summarise(quantity = n(), money = sum(Monetary)) %>%
  mutate(player = "player") %>%
  ungroup()

# Player count per cell
ggplot(lcg, aes(x = player, y = quantity, fill = quantity)) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  geom_bar(stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = max(quantity) / 2, label = round(quantity, 0)), size = 4) +
  facet_grid(segm.freq ~ segm.rec) +
  theme(axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  ggtitle("Life Cycle Grids - Number of player per Recency & Frequency")

Life cycle grid by monetary , frequency , recency

# Revenue per (recency, frequency) cell
ggplot(lcg, aes(x = player, y = money, fill = money)) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  geom_bar(stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = max(money) / 2, label = round(money, 0)), size = 4) +
  facet_grid(segm.freq ~ segm.rec) +
  theme(axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  ggtitle("Life Cycle Grids - Money earned per Recency & Frequency")

Life cycle grid by gender, frequency , recency

# Grid cells additionally split by gender
lcg.gender <- basetable_summer %>%
  group_by(segm.rec, segm.freq, Sex) %>%
  summarise(quantity = n(), money = sum(Monetary)) %>%
  mutate(player = "player") %>%
  ungroup()

# Player count per cell, by gender
ggplot(lcg.gender, aes(x = Sex, y = quantity, fill = Sex)) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  geom_bar(stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = max(quantity) / 2, label = round(quantity, 0)), size = 4) +
  facet_grid(segm.freq ~ segm.rec) +
  theme(axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  ggtitle("Life Cycle Grids - Gender per Recency & Frequency")

Life cycle grid by gender, frequency , recency

# Revenue per cell, by gender
ggplot(lcg.gender, aes(x = Sex, y = money, fill = Sex)) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  geom_bar(stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = max(money) / 2, label = round(money, 0)), size = 4) +
  facet_grid(segm.freq ~ segm.rec) +
  theme(axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  ggtitle("Life Cycle Grids - Money earned per Gender, Recency & Frequency")

Life cycle grid by customer type and Quantity

# Grid cells split by customer type
lcg.test <- basetable_summer %>%
  group_by(CustomerType, segm.rec, segm.freq) %>%
  summarise(quantity = n(), money = sum(Monetary)) %>%
  ungroup()

# Player count per cell, by customer type
ggplot(lcg.test, aes(x = CustomerType, fill = CustomerType)) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  geom_bar(aes(y = quantity), stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = quantity, label = round(quantity, 0)), size = 3) +
  facet_grid(segm.freq ~ segm.rec) +
  theme(axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  ggtitle("LifeCycle Grids - Number of player per Customer type, Recency & Frequency ")

Life cycle grid by customer type and Monetary value

# Revenue per cell, by customer type
ggplot(lcg.test, aes(x = CustomerType, fill = CustomerType)) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  geom_bar(aes(y = money), stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = money, label = round(money, 0)), size = 2) +
  facet_grid(segm.freq ~ segm.rec) +
  theme(axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  ggtitle("LifeCycle Grids - Money earned per Customer type, Recency & Frequency ")

Life cycle grid by customer type, gender and Quantity

# Grid cells split by both customer type and gender
lcg.test.gender <- basetable_summer %>%
  group_by(CustomerType, segm.rec, segm.freq, Sex) %>%
  summarise(quantity = n(), money = sum(Monetary)) %>%
  ungroup()

# Player count per cell, stacked by gender within each customer type
ggplot(lcg.test.gender, aes(x = CustomerType, y = quantity, fill = Sex)) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  geom_bar(aes(y = quantity), stat = 'identity', alpha = 0.5) +
  #geom_text(aes(y = quantity, label = round(quantity, 0)), size = 2) +
  facet_grid(segm.freq ~ segm.rec) +
  theme(axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  ggtitle("Number of player per Customer type, Gender, Recency & Frequency")

Life cycle grid by customer type, gender and Monetary Value

# Revenue per cell, stacked by gender within each customer type.
# FIX: the title omitted "Gender" although the plot is gender-split
# (copy-paste from the non-gender chart).
ggplot(lcg.test.gender, aes(x=CustomerType, fill=Sex)) +

  theme_bw() +
  theme(panel.grid = element_blank())+
  geom_bar(aes(y=money), stat='identity', alpha=0.5) +
  #geom_text(aes(y=money, label=round(money,0)), size=2) +
  facet_grid(segm.freq ~ segm.rec) +
  theme(axis.text.x=element_text(angle=90, hjust=.5, vjust=.5, face="plain")) +
  ggtitle("LifeCycle Grids - Money earned per Customer type, Gender, Recency & Frequency ")

~~~~

Assignment 3 (3,5 points) - Churn analysis

~~~~

This assignment focuses on understanding churn after the summer period - i.e., defined as not performing any microtransactions in fall 2018. According to this definition calculate which customers have churned in the fall using the fall financial and usage transaction database.

Update the basetable from assignment 1 with this churn information and check the average churn rate.

Use logistic regression to find the significant factors that affect the churn rate of a customer. Did the fall bonus have any impact on the churn rate? You can add additional independent variables to the basetable as well but please motivate why you would think that they affect churn.

# Activity status per period for every customer:
#   0 = no sessions, 1 = sessions only, 2 = at least one transaction
# (transactions take precedence over sessions)
churn_status <- customer %>% select(CustomerID)
churn_status$summer_status <- ifelse(churn_status$CustomerID %in% summer_trx$CustomerID, 2,
                              ifelse(churn_status$CustomerID %in% summer_sess$CustomerID, 1, 0))
churn_status$fall_status <- ifelse(churn_status$CustomerID %in% fall_trx$CustomerID, 2,
                            ifelse(churn_status$CustomerID %in% fall_sess$CustomerID, 1, 0))

# Churn flags; NA means the customer is not part of that at-risk group:
#   churn_trx   : summer buyers  -> 1 if no fall transactions
#   churn_sess  : summer actives -> 1 if no fall sessions at all
#   churn_total : summer buyers  -> 1 if no fall sessions at all
churn_status <- churn_status %>%
  mutate(
    churn_trx   = ifelse(summer_status == 2, ifelse(fall_status == 2, 0, 1), NA),
    churn_sess  = ifelse(summer_status != 0, ifelse(fall_status == 0, 1, 0), NA),
    churn_total = ifelse(summer_status == 2, ifelse(fall_status == 0, 1, 0), NA)
  )



# Keep only customers active in at least one of the two periods
churn_status <- churn_status %>% filter(!(summer_status == 0 & fall_status == 0))
churn_status

# Migration matrix: rows = summer status, cols = fall status (levels 0/1/2).
# CustomerID is unique per row here, so a plain contingency table replaces
# the original 3x3 nested filter/n_distinct loop; explicit factor levels
# keep empty categories as zero counts.
churn_matrix <- as.data.frame.matrix(
  table(factor(churn_status$summer_status, levels = 0:2),
        factor(churn_status$fall_status, levels = 0:2))
)

rownames(churn_matrix) <- c("Inflow","Summer_sessions","Summer_transactions")
colnames(churn_matrix) <- c("Outflow","Fall_sessions","Fall_transactions")

# Put the inflow row and outflow column last
churn_matrix <- churn_matrix[c("Summer_sessions","Summer_transactions","Inflow"),
                            c("Fall_sessions","Fall_transactions","Outflow")]

# Append marginal totals as an extra row and column
churn_matrix <- rbind(churn_matrix, data.frame(as.list(colSums(churn_matrix))))
churn_matrix <- cbind(churn_matrix,t(data.frame(as.list(rowSums(churn_matrix)))))
colnames(churn_matrix) <- c(colnames(churn_matrix)[-4],"Total")
rownames(churn_matrix) <- c(rownames(churn_matrix)[-4],"Total")

churn_matrix

We are going to update the basetable with the churn information

# Attach the churn flags to the basetable and drop helper columns
basetable_churn <- basetable_summer %>%
  left_join(churn_status, by = 'CustomerID') %>%
  select(-c("NbTrx", "NbSess", "summer_status", "fall_status"))

# Dummy-encode CustomerType (type 4 is the reference) and drop segments
basetable_churn <- basetable_churn %>%
  mutate(Type1 = ifelse(CustomerType == 1, 1, 0),
         Type2 = ifelse(CustomerType == 2, 1, 0),
         Type3 = ifelse(CustomerType == 3, 1, 0)) %>%
  select(-c("CustomerType", "segm.freq", "segm.rec"))

# One modelling table per churn definition (rows with NA flag excluded)
basetable_churn_trx   <- basetable_churn[!is.na(basetable_churn$churn_trx), ] %>% select(-c("churn_sess", "churn_total"))
basetable_churn_sess  <- basetable_churn[!is.na(basetable_churn$churn_sess), ] %>% select(-c("churn_trx", "churn_total"))
basetable_churn_total <- basetable_churn[!is.na(basetable_churn$churn_total), ] %>% select(-c("churn_trx", "churn_sess"))
basetable_churn_total
basetable_churn_trx %>% summarise(mean(churn_trx))

The database shows a churn rate of about 66%, which is very high.

Log Reg 1 : churn_trx

Let’s split the dataset to perform our logistic regression

# Reproducible 70/30 train-validation split on the transaction-churn table
set.seed(654)
split <- sample.split(basetable_churn_trx$CustomerID, SplitRatio = 0.70)
training   <- subset(basetable_churn_trx, split == TRUE)
validation <- subset(basetable_churn_trx, split == FALSE)

# Full logistic regression: transaction churn on every available predictor
modeltraining <- glm(churn_trx ~ ., family = binomial(link = 'logit'), data = training)
summary(modeltraining)
## 
## Call:
## glm(formula = churn_trx ~ ., family = binomial(link = "logit"), 
##     data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1489  -1.0835   0.6986   0.8569   1.6772  
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -5.550e+00  6.089e+00  -0.911   0.3620    
## CustomerID        -4.051e-05  4.565e-05  -0.887   0.3748    
## Recency            3.277e-03  2.891e-03   1.133   0.2571    
## Frequency         -1.692e-01  2.053e-01  -0.824   0.4099    
## Recency_Transac   -2.838e-03  1.941e-03  -1.463   0.1436    
## Frequency_Transac  2.683e-01  3.068e-01   0.874   0.3819    
## Monetary          -1.653e-03  6.719e-03  -0.246   0.8057    
## Registrationdate   4.013e-04  3.443e-04   1.165   0.2439    
## SexWomen          -9.028e-02  1.329e-01  -0.679   0.4969    
## Age               -1.619e-03  4.810e-03  -0.336   0.7365    
## fallbonus         -1.441e+00  1.564e-01  -9.214   <2e-16 ***
## Income            -5.358e-02  7.938e-02  -0.675   0.4997    
## Experience        -2.178e-01  1.012e-01  -2.152   0.0314 *  
## Pokestops          1.090e+01  5.062e+00   2.153   0.0313 *  
## Gyms               4.356e+01  2.024e+01   2.152   0.0314 *  
## Raids              2.178e+02  1.012e+02   2.152   0.0314 *  
## Social             4.357e+01  2.024e+01   2.152   0.0314 *  
## Pokemons           1.089e+01  5.061e+00   2.151   0.0315 *  
## Distance           2.179e+01  1.012e+01   2.152   0.0314 *  
## Duration          -1.366e-04  6.576e-04  -0.208   0.8355    
## NbTrxProd1        -1.012e-01  2.877e-01  -0.352   0.7251    
## NbTrxProd2        -3.116e-01  2.789e-01  -1.117   0.2639    
## NbTrxProd3        -2.688e-01  2.696e-01  -0.997   0.3188    
## NbTrxProd4                NA         NA      NA       NA    
## NbTrxProd5                NA         NA      NA       NA    
## clv                       NA         NA      NA       NA    
## clv_time                  NA         NA      NA       NA    
## Type1             -1.027e-01  3.300e-01  -0.311   0.7556    
## Type2              6.630e-02  3.024e-01   0.219   0.8265    
## Type3              8.713e-01  3.864e-01   2.255   0.0241 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1534.7  on 1196  degrees of freedom
## Residual deviance: 1406.3  on 1171  degrees of freedom
## AIC: 1458.3
## 
## Number of Fisher Scoring iterations: 4

We only keep significant variables thanks to the stepwise selection

# Backward stepwise selection; k = log(n) makes step() optimise BIC
model.backward <- step(modeltraining, direction = "backward",
                       k = log(nrow(training)), trace = 0)
pred.backward <- predict(model.backward, newdata = validation, type = "response")
summary(model.backward)
## 
## Call:
## glm(formula = churn_trx ~ fallbonus + Experience + Type3, family = binomial(link = "logit"), 
##     data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0147  -1.0446   0.7517   0.8456   1.6397  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.211e+00  1.246e-01   9.716  < 2e-16 ***
## fallbonus   -1.389e+00  1.528e-01  -9.095  < 2e-16 ***
## Experience  -1.975e-05  5.139e-06  -3.844 0.000121 ***
## Type3        7.673e-01  1.788e-01   4.292 1.77e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1534.7  on 1196  degrees of freedom
## Residual deviance: 1426.3  on 1193  degrees of freedom
## AIC: 1434.3
## 
## Number of Fisher Scoring iterations: 4

We need to find the best threshold to get the highest PCC

# Score the validation set with the reduced model
pred <- predict(model.backward, type = "response", newdata = validation)
validation$prob_churn <- pred

# Using probability cutoff of 67%.

pred_churn <- factor(ifelse(pred >= 0.67, "Yes", "No"))
# BUG FIX: the original read validation$churn, which only resolved through
# $ partial matching; reference the churn_trx column explicitly.
actual_churn <- factor(ifelse(validation$churn_trx == 1, "Yes", "No"))
table(actual_churn, pred_churn)
##             pred_churn
## actual_churn  No Yes
##          No   80  99
##          Yes  91 244
# Percentage correctly classified on the validation set
PCC <- mean(actual_churn == pred_churn)
PCC
## [1] 0.6303502

LogReg 2 : churn_sess

Let’s split the dataset to perform our logistic regression

# Same reproducible 70/30 split, now on the session-churn table
set.seed(654)
split <- sample.split(basetable_churn_sess$CustomerID, SplitRatio = 0.70)
training   <- subset(basetable_churn_sess, split == TRUE)
validation <- subset(basetable_churn_sess, split == FALSE)

# Full logistic regression: session churn on every available predictor
modeltraining <- glm(churn_sess ~ ., family = binomial(link = 'logit'), data = training)
summary(modeltraining)
## 
## Call:
## glm(formula = churn_sess ~ ., family = binomial(link = "logit"), 
##     data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2182  -0.8128  -0.6336   1.2221   2.6356  
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error z value Pr(>|z|)   
## (Intercept)       -7.931e+00  3.757e+00  -2.111  0.03479 * 
## CustomerID         5.859e-06  2.845e-05   0.206  0.83685   
## Recency           -1.579e-03  1.634e-03  -0.966  0.33394   
## Frequency          7.651e-02  1.440e-01   0.531  0.59507   
## Recency_Transac   -3.411e-03  1.774e-03  -1.923  0.05448 . 
## Frequency_Transac -8.907e-01  3.780e-01  -2.356  0.01845 * 
## Monetary           1.291e-02  7.881e-03   1.638  0.10145   
## Registrationdate   4.294e-04  2.127e-04   2.018  0.04355 * 
## SexWomen          -5.999e-02  8.419e-02  -0.712  0.47618   
## Age               -4.236e-03  3.144e-03  -1.347  0.17791   
## fallbonus         -2.368e-01  1.058e-01  -2.237  0.02527 * 
## Income            -5.754e-02  5.032e-02  -1.143  0.25283   
## Experience         1.486e-02  7.175e-02   0.207  0.83590   
## Pokestops         -7.630e-01  3.587e+00  -0.213  0.83157   
## Gyms              -2.975e+00  1.435e+01  -0.207  0.83578   
## Raids             -1.485e+01  7.174e+01  -0.207  0.83607   
## Social            -2.954e+00  1.435e+01  -0.206  0.83690   
## Pokemons          -7.346e-01  3.587e+00  -0.205  0.83775   
## Distance          -1.474e+00  7.174e+00  -0.205  0.83725   
## Duration          -3.359e-04  4.803e-04  -0.699  0.48436   
## NbTrxProd1         6.997e-01  3.703e-01   1.889  0.05886 . 
## NbTrxProd2         4.781e-01  3.620e-01   1.321  0.18661   
## NbTrxProd3         5.341e-01  3.559e-01   1.501  0.13343   
## NbTrxProd4                NA         NA      NA       NA   
## NbTrxProd5                NA         NA      NA       NA   
## clv                       NA         NA      NA       NA   
## clv_time                  NA         NA      NA       NA   
## Type1              5.842e-02  2.015e-01   0.290  0.77191   
## Type2              5.258e-01  1.753e-01   2.999  0.00271 **
## Type3             -6.394e-01  2.466e-01  -2.593  0.00952 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3742.4  on 3300  degrees of freedom
## Residual deviance: 3553.5  on 3275  degrees of freedom
## AIC: 3605.5
## 
## Number of Fisher Scoring iterations: 4

We only keep significant variables thanks to the stepwise selection

# Backward stepwise variable selection starting from the full model.
# k = log(n) makes step() optimize the BIC criterion (a heavier penalty than
# AIC's k = 2), so only strong predictors survive; trace = 0 suppresses the log.
model.backward <- step(modeltraining,direction="backward",k=log(nrow(training)),trace=0)
# Predicted churn probabilities on the hold-out validation set
# (type = "response" returns probabilities rather than log-odds).
pred.backward <- predict(model.backward, newdata=validation, type="response")
summary(model.backward)
## 
## Call:
## glm(formula = churn_sess ~ Frequency_Transac + Type2 + Type3, 
##     family = binomial(link = "logit"), data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0029  -0.8179  -0.6865   1.3625   2.5390  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.92324    0.05967 -15.473  < 2e-16 ***
## Frequency_Transac -0.40221    0.06726  -5.980 2.23e-09 ***
## Type2              0.49790    0.09272   5.370 7.88e-08 ***
## Type3             -0.65044    0.11972  -5.433 5.54e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3742.4  on 3300  degrees of freedom
## Residual deviance: 3598.5  on 3297  degrees of freedom
## AIC: 3606.5
## 
## Number of Fisher Scoring iterations: 4

LogReg 3 : churn_total

Let’s split the dataset to perform our logistic regression

# Fix the RNG seed so the train/validation split is reproducible.
set.seed(654)
# Stratified 70/30 split on CustomerID: TRUE = training, FALSE = validation.
split <- sample.split(basetable_churn_total$CustomerID, SplitRatio = 0.70)
training <- subset(basetable_churn_total, split == TRUE)
validation <- subset(basetable_churn_total, split == FALSE)

# Full logistic regression: churn_total against all remaining columns.
# NOTE(review): collinear predictors (NbTrxProd4/5, clv, clv_time) produce NA
# coefficients in the summary below — they are linear combinations of others.
modeltraining = glm(churn_total~.,family=binomial(link='logit'),data=training)
summary(modeltraining)
## 
## Call:
## glm(formula = churn_total ~ ., family = binomial(link = "logit"), 
##     data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0945  -0.6843  -0.5245  -0.3346   2.6329  
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.298e+01  7.227e+00  -1.797   0.0724 .  
## CustomerID        -7.309e-05  5.587e-05  -1.308   0.1908    
## Recency            1.891e-03  3.171e-03   0.596   0.5510    
## Frequency          2.635e-01  2.656e-01   0.992   0.3211    
## Recency_Transac    1.453e-03  2.334e-03   0.622   0.5337    
## Frequency_Transac -4.677e-02  3.732e-01  -0.125   0.9003    
## Monetary           1.741e-03  7.851e-03   0.222   0.8245    
## Registrationdate   6.762e-04  4.087e-04   1.655   0.0980 .  
## SexWomen          -6.845e-05  1.605e-01   0.000   0.9997    
## Age               -7.137e-04  5.977e-03  -0.119   0.9050    
## fallbonus         -1.140e+00  2.578e-01  -4.421  9.8e-06 ***
## Income            -1.411e-02  9.490e-02  -0.149   0.8818    
## Experience         6.512e-02  1.306e-01   0.498   0.6181    
## Pokestops         -3.280e+00  6.533e+00  -0.502   0.6156    
## Gyms              -1.304e+01  2.613e+01  -0.499   0.6176    
## Raids             -6.509e+01  1.306e+02  -0.498   0.6183    
## Social            -1.302e+01  2.613e+01  -0.498   0.6183    
## Pokemons          -3.246e+00  6.532e+00  -0.497   0.6192    
## Distance          -6.499e+00  1.306e+01  -0.497   0.6188    
## Duration          -1.107e-03  8.335e-04  -1.329   0.1840    
## NbTrxProd1         1.962e-01  3.441e-01   0.570   0.5686    
## NbTrxProd2         5.797e-02  3.367e-01   0.172   0.8633    
## NbTrxProd3         6.322e-02  3.297e-01   0.192   0.8480    
## NbTrxProd4                NA         NA      NA       NA    
## NbTrxProd5                NA         NA      NA       NA    
## clv                       NA         NA      NA       NA    
## clv_time                  NA         NA      NA       NA    
## Type1              3.291e-01  3.910e-01   0.842   0.4000    
## Type2              4.867e-01  3.503e-01   1.389   0.1648    
## Type3             -8.553e-02  4.597e-01  -0.186   0.8524    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1121.0  on 1196  degrees of freedom
## Residual deviance: 1050.6  on 1171  degrees of freedom
## AIC: 1102.6
## 
## Number of Fisher Scoring iterations: 5

We keep only the significant variables, selected by backward stepwise elimination under the BIC criterion.

# Backward stepwise variable selection starting from the full model.
# k = log(n) makes step() optimize the BIC criterion (a heavier penalty than
# AIC's k = 2), so only strong predictors survive; trace = 0 suppresses the log.
model.backward <- step(modeltraining,direction="backward",k=log(nrow(training)),trace=0)
# Predicted churn probabilities on the hold-out validation set
# (type = "response" returns probabilities rather than log-odds).
pred.backward <- predict(model.backward, newdata=validation, type="response")
summary(model.backward)
## 
## Call:
## glm(formula = churn_total ~ fallbonus + Gyms + Type2, family = binomial(link = "logit"), 
##     data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9692  -0.6795  -0.5614  -0.3643   2.4906  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.000920   0.164362  -6.090 1.13e-09 ***
## fallbonus   -1.142814   0.255289  -4.477 7.59e-06 ***
## Gyms        -0.014471   0.003889  -3.721 0.000198 ***
## Type2        0.518296   0.180561   2.870 0.004099 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1121.0  on 1196  degrees of freedom
## Residual deviance: 1066.4  on 1193  degrees of freedom
## AIC: 1074.4
## 
## Number of Fisher Scoring iterations: 5